{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n\n# Compiling Llama2 using the dynamo backend\n\nThis script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Imports and Model Definition\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import torch\nimport torch_tensorrt\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nfrom utils import export_llm, generate"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Define the parameters and initialize the model\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "MAX_TOKENS = 32\nDEVICE = torch.device(\"cuda:0\")\n\n# Define the Llama2 model from hugging face\n# kv_cache is not supported in Torch-TRT currently.\n# CPU is used here so that GPU memory is reserved for TRT compilation.\nllama_path = \"meta-llama/Llama-2-7b-chat-hf\"\nwith torch.no_grad():\n    model = (\n        AutoModelForCausalLM.from_pretrained(\n            llama_path, use_cache=False, attn_implementation=\"eager\"\n        )\n        .eval()\n        .half()\n    )\n\ntokenizer = AutoTokenizer.from_pretrained(llama_path)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Tokenize a sample input prompt and get pytorch model outputs\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "prompt = \"What is dynamic programming?\"\nmodel_inputs = tokenizer(prompt, return_tensors=\"pt\")\ninput_ids = model_inputs.input_ids\n\n# Auto-regressive generation loop for greedy decoding using PyTorch model\n# We use a custom generate function which is very similar to the huggingface one.\npyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Export the llama2 model into an ExportedProgram which is input of TRT compilation\n# To compile the model in FP16, we do the following\n# 1) Cast the model to FP16 via model.half()\n# 2) Enable use_explicit_typing=True. Certain layers are explicitly casted to FP32 within the pytorch model and this flag respects this behavior during TRT compilation\n# 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch)\nllama2_ep = export_llm(model, input_ids, max_seq_len=64)\ntrt_model = torch_tensorrt.dynamo.compile(\n    llama2_ep,\n    inputs=[input_ids],\n    enabled_precisions={torch.float32},\n    truncate_double=True,\n    device=DEVICE,\n    disable_tf32=True,\n    use_explicit_typing=True,\n    use_fp32_acc=True,\n)\n\n# Auto-regressive generation loop for greedy decoding using TensorRT model\n# We use a custom generate function which is very similar to the huggingface one.\n# Move inputs to GPU\ninput_ids = input_ids.to(DEVICE)\ntrt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Decode the output sentences of PyTorch and TensorRT\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(\"=============================\")\nprint(\n    \"Pytorch model generated text: \",\n    tokenizer.batch_decode(\n        pyt_gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False\n    )[0],\n)\nprint(\"=============================\")\nprint(\n    \"TensorRT model generated text: \",\n    tokenizer.batch_decode(\n        trt_gen_tokens,\n        skip_special_tokens=True,\n        clean_up_tokenization_spaces=False,\n    )[0],\n)\n\n\n# Prompt : What is dynamic programming?\n\n# =============================\n# Pytorch model generated text: Dynamic programming is an algorithmic technique used to solve complex problems by breaking them down into smaller subproblems, solving each subproblem only once, and\n\n# =============================\n# TensorRT model generated text: Dynamic programming is an algorithmic technique used to solve complex problems by breaking them down into smaller subproblems, solving each subproblem only once, and"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}